################################################################################
## Paper:     Working the Crowd
## Authors:   P. Mongrain, N. Fréchet, B. Thompson Collart, and Y. Dufresne
## Date:      February 2025
################################################################################

################################################################################
## IMPORTANT NOTE
################################################################################

# Please run the "merge.do" *before* running the code

################################################################################
## SET WORKING DIRECTORY
################################################################################

getwd()

setwd("C:/...") #add appropriate file path

################################################################################
## LOAD PACKAGES 
################################################################################

#remotes::install_github("NightingaleHealth/ggforestplot")

library(cowplot)
library(devtools)
library(dfoptim)
library(dotwhisker)
library(dplyr) 
library(effects)
library(emmeans)
library(expss)
library(ggalt)
library(ggeasy)
library(ggeffects)
library(ggforestplot)
library(ggplot2)
library(ggpubr)
library(ggridges)
library(grid)
library(gridExtra)
library(haven)
library(interplot)
library(lattice)
library(lme4)
library(lubridate)
library(magrittr)
library(marginaleffects)
library(margins)
library(merTools)
library(prettyR)
library(psych)
library(rms)
library(scales)
library(sjmisc)
library(sjPlot)
library(srvyr)
library(tibble)
library(tidyverse)
library(visreg)

if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, lme4, marginaleffects, optimx, broom.mixed)

################################################################################
## LOAD DATA
################################################################################

## IMPORT DATA FROM STATA

merge <- read_dta("merge.dta")

## CREATE SEPARATE DATAFRAME FOR EACH ELECTION

ca2011.data <- merge %>% filter(election == "ca2011")
ca2015.data <- merge %>% filter(election == "ca2015")
ca2019.data <- merge %>% filter(election == "ca2019")
on2011.data <- merge %>% filter(election == "on2011")
on2014.data <- merge %>% filter(election == "on2014")
qc2022.data <- merge %>% filter(election == "qc2022")

ca2015.data <- filter(ca2015.data, type == "lpp")
ca2015.data <- Filter(function(x)!all(is.na(x)), ca2015.data)

## DEFINE VARIABLES AS FACTOR OR NUMERIC

ca2015.data$correct_district <- as.factor(ca2015.data$correct_district)
ca2015.data$vote_district <- as.factor(ca2015.data$vote_district)
ca2015.data$pidstatus_district <- as.factor(ca2015.data$pidstatus_district)
ca2015.data$univ <- as.factor(ca2015.data$univ)
ca2015.data$male <- as.factor(ca2015.data$male)
ca2015.data$age55 <- as.factor(ca2015.data$age55)
ca2015.data$highinc <- as.factor(ca2015.data$highinc)
ca2015.data$boundary <- as.factor(ca2015.data$boundary)
ca2015.data$margin <- as.numeric(ca2015.data$margin)
ca2015.data$interest_n <- as.numeric(ca2015.data$interest_n)
ca2015.data$time <- as.numeric(ca2015.data$time)
ca2015.data$vote_district <- factor(ca2015.data$vote_district, levels = c(0,1),
                                    labels = c("No", "Yes"))

## CREATE MODE FUNCTION

getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

################################################################################
## RANDOM INTERCEPT
################################################################################

## Estimate model

ca2015.data = ca2015.data %>%
    mutate(margin = (margin - mean(margin)) / sd(margin)) #scaling margin

ca2015.data = ca2015.data %>%
  mutate(time = (time - mean(time)) / sd(time)) #scaling response date

pacman::p_load(lmerTest) #show statistical significance

ca2015.2.ri.fit <- glmer(correct_district ~ 1 + pidstatus_district*interest_n + univ + male + 
                     age55 + highinc + boundary + margin + time +
                     (1 | district_code), weight=survey_weight, 
                     data=ca2015.data, 
                     family=binomial(link="logit"))

summary(ca2015.2.ri.fit, correlation = FALSE)

## Extracting random effects

ranef(ca2015.2.ri.fit, 
      drop = TRUE, #to have them as a vector
      condVar = FALSE)

## Plotting random effects

broom.mixed::tidy(ca2015.2.ri.fit, effects = "ran_vals") %>%
    ggplot(aes(x = reorder(level, -estimate), y = estimate)) +
    geom_hline(yintercept = 0, color = "grey70") +
    geom_point(color = "cyan4") +
    geom_errorbar(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), 
                  width = 0, color = "cyan4") +
    labs(x = "District", y = "RE estimate") +
    scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
    theme_classic() +
    coord_flip()

################################################################################
## RANDOM SLOPE
################################################################################

ca2015.2.rs.fit.bob <- glmer(correct_district ~ 1 + pidstatus_district*interest_n + univ + male + 
                    age55 + highinc + margin + time + boundary + 
                    (1 + pidstatus_district*interest_n | district_code), weight=survey_weight,
                    control = glmerControl(optimizer = "bobyqa"),
                    data=ca2015.data, 
                    family=binomial(link="logit"))

summary(ca2015.2.rs.fit.bob, correlation = FALSE)

################################################################################
## PREDICTED PROBABILITIES
################################################################################

ca2015.data <- filter(ca2015.data, !is.na(correct_district))
ca2015.data <- filter(ca2015.data, !is.na(univ))
ca2015.data <- filter(ca2015.data, !is.na(pidstatus_district))
ca2015.data <- filter(ca2015.data, !is.na(interest_n))
ca2015.data <- filter(ca2015.data, !is.na(male))
ca2015.data <- filter(ca2015.data, !is.na(age55))
ca2015.data <- filter(ca2015.data, !is.na(highinc))
ca2015.data <- filter(ca2015.data, !is.na(boundary))
ca2015.data <- filter(ca2015.data, !is.na(margin))
ca2015.data <- filter(ca2015.data, !is.na(time))

ca2015.2.rs.preds.1 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="0", 
                                                    interest_n=0, 
                                                    pidstatus_district="1",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.2 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="1", 
                                                    interest_n=1, 
                                                    pidstatus_district="1",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.3 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="0", 
                                                    interest_n=0, 
                                                    pidstatus_district="2",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.4 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="1", 
                                                    interest_n=1, 
                                                    pidstatus_district="2",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.5 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="0", 
                                                    interest_n=0, 
                                                    pidstatus_district="3",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.6 <- with(ca2015.data, data.frame(district_code=unique(district_code), 
                                                    univ="1", 
                                                    interest_n=1, 
                                                    pidstatus_district="3",
                                                    male=getmode(male),
                                                    age55=getmode(age55),
                                                    highinc=getmode(highinc),
                                                    boundary=getmode(boundary), 
                                                    margin=0,
                                                    time=0))

ca2015.2.rs.preds.1$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.1, type = "response")
ca2015.2.rs.preds.2$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.2, type = "response")
ca2015.2.rs.preds.3$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.3, type = "response")
ca2015.2.rs.preds.4$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.4, type = "response")
ca2015.2.rs.preds.5$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.5, type = "response")
ca2015.2.rs.preds.6$preds = predict(ca2015.2.rs.fit.bob, newdata = ca2015.2.rs.preds.6, type = "response")

ca2015.2.rs.preds.1 <- ca2015.2.rs.preds.1 %>% mutate(pr.type = 1)
ca2015.2.rs.preds.1 <- ca2015.2.rs.preds.1 %>% arrange(preds)
ca2015.2.rs.preds.1$id <- seq(1, nrow(ca2015.2.rs.preds.1))
order.district.loser <- subset(ca2015.2.rs.preds.1, select = c(district_code, id))

ca2015.2.rs.preds.2.2 <- ca2015.2.rs.preds.2 %>% mutate(pr.type = 88)
ca2015.2.rs.preds.2 <- ca2015.2.rs.preds.2 %>% mutate(pr.type = 2)
ca2015.2.rs.preds.2 <- ca2015.2.rs.preds.2 %>% arrange(preds)
ca2015.2.rs.preds.2$id <- seq(1, nrow(ca2015.2.rs.preds.2))

ca2015.2.rs.preds.2.2 <- merge(ca2015.2.rs.preds.2.2, order.district.loser, by="district_code")
ca2015.2.rs.preds.2.2$id <- seq(1, nrow(ca2015.2.rs.preds.2.2))

ca2015.2.rs.preds.3 <- ca2015.2.rs.preds.3 %>% mutate(pr.type = 3)
ca2015.2.rs.preds.3 <- ca2015.2.rs.preds.3 %>% arrange(preds)
ca2015.2.rs.preds.3$id <- seq(1, nrow(ca2015.2.rs.preds.3))
order.district.ind <- subset(ca2015.2.rs.preds.3, select = c(district_code, id))

ca2015.2.rs.preds.4.2 <- ca2015.2.rs.preds.4 %>% mutate(pr.type = 99)
ca2015.2.rs.preds.4 <- ca2015.2.rs.preds.4 %>% mutate(pr.type = 4)
ca2015.2.rs.preds.4 <- ca2015.2.rs.preds.4 %>% arrange(preds)
ca2015.2.rs.preds.4$id <- seq(1, nrow(ca2015.2.rs.preds.4))

ca2015.2.rs.preds.4.2 <- merge(ca2015.2.rs.preds.4.2, order.district.ind, by="district_code")
ca2015.2.rs.preds.4.2$id <- seq(1, nrow(ca2015.2.rs.preds.4.2))

ca2015.2.rs.preds.5 <- ca2015.2.rs.preds.5 %>% mutate(pr.type = 5)
ca2015.2.rs.preds.5 <- ca2015.2.rs.preds.5 %>% arrange(preds)
ca2015.2.rs.preds.5$id <- seq(1, nrow(ca2015.2.rs.preds.5))
order.district.winner <- subset(ca2015.2.rs.preds.5, select = c(district_code, id))

ca2015.2.rs.preds.6.2 <- ca2015.2.rs.preds.6 %>% mutate(pr.type = 999)
ca2015.2.rs.preds.6 <- ca2015.2.rs.preds.6 %>% mutate(pr.type = 6)
ca2015.2.rs.preds.6 <- ca2015.2.rs.preds.6 %>% arrange(preds)
ca2015.2.rs.preds.6$id <- seq(1, nrow(ca2015.2.rs.preds.6))

ca2015.2.rs.preds.6.2 <- merge(ca2015.2.rs.preds.6.2, order.district.winner, by="district_code")
ca2015.2.rs.preds.6.2$id <- seq(1, nrow(ca2015.2.rs.preds.6.2))

ca2015.2.rs.preds.loser <- rbind(ca2015.2.rs.preds.1, ca2015.2.rs.preds.2, ca2015.2.rs.preds.2.2)
ca2015.2.rs.preds.ind <- rbind(ca2015.2.rs.preds.3, ca2015.2.rs.preds.4, ca2015.2.rs.preds.4.2)
ca2015.2.rs.preds.winner <- rbind(ca2015.2.rs.preds.5, ca2015.2.rs.preds.6, ca2015.2.rs.preds.6.2)

ca2015.2.rs.preds.loser$pr.type <- as.factor(ca2015.2.rs.preds.loser$pr.type)
ca2015.2.rs.preds.ind$pr.type <- as.factor(ca2015.2.rs.preds.ind$pr.type)
ca2015.2.rs.preds.winner$pr.type <- as.factor(ca2015.2.rs.preds.winner$pr.type)

################################################################################
## FIGURE 4: CANADA 2015
################################################################################

ca2015.2.rs.preds.loser$title <- "(a) Losers"

ca2015.2.rs.preds.loser.g <- ggplot(ca2015.2.rs.preds.loser, aes(x = reorder(id, -id), 
                    y = preds)) + geom_point(aes(color = pr.type, alpha = pr.type, shape = pr.type), show.legend = FALSE) + 
                    labs(x = "District", y = "Predicted Probability of Correct Forecast") +
                    scale_x_discrete(guide = guide_axis(n.dodge = 2), expand = expansion(mult = c(.02, .02))) +
                    scale_y_continuous("Predicted Probability", limits=c(0,1), breaks = seq(0,1,.25)) +
                    geom_hline(yintercept = 0.5, color = "grey70", linetype = 2) +
                    scale_colour_manual(name = "Education & Interest", values = c("#D55E00","#E69F00","#E69F00"),
                    labels = c("Low", "High", "")) +
                    scale_alpha_manual(values = c(1, 1, .15)) + 
                    scale_shape_manual(values = c(17, 17, 17)) +
                    theme_bw() + coord_flip() + facet_grid(. ~ title)

ca2015.2.rs.preds.loser.g <- ca2015.2.rs.preds.loser.g + theme(panel.grid.major = element_blank(),
                                             panel.grid.minor = element_blank(),
                                             panel.background = element_blank(),
                                             axis.text.y = element_blank(), 
                                             axis.ticks.y = element_blank(),
                                             axis.title.y = element_blank(),
                                             axis.title.x = element_blank(),
                                             axis.text.x = element_text(size = 11, colour="white"),
                                             axis.ticks = element_line(color = "#00000000"),                                             
                                             strip.text = element_text(size = 13),
                                             plot.title = element_text(size = 12, hjust = 0.5))

ca2015.2.rs.preds.loser.g

ca2015.2.rs.preds.ind$title <- "(b) Independents"

ca2015.2.rs.preds.ind.g <- ggplot(ca2015.2.rs.preds.ind, aes(x = reorder(id, -id), 
                     y = preds)) + geom_point(aes(color = pr.type, alpha = pr.type, shape = pr.type), show.legend = FALSE) + 
                     labs(x = "District", y = "Predicted Probability of Correct Forecast") +
                     scale_x_discrete(guide = guide_axis(n.dodge = 2), expand = expansion(mult = c(.02, .02))) +
                     scale_y_continuous("Predicted Probability", limits=c(0,1), breaks = seq(0,1,.25)) +
                     geom_hline(yintercept = 0.5, color = "grey70", linetype = 2) +
                     scale_colour_manual(name = "Education & Interest", values = c("#D55E00","#E69F00","#E69F00"),
                                        labels = c("Low", "High", "")) +
                     scale_alpha_manual(values = c(1, 1, .15)) + 
                     scale_shape_manual(values = c(17, 17, 17)) +
                     theme_bw() + coord_flip() + facet_grid(. ~ title)

ca2015.2.rs.preds.ind.g <- ca2015.2.rs.preds.ind.g + theme(panel.grid.major = element_blank(),
                                            panel.grid.minor = element_blank(),
                                            panel.background = element_blank(),
                                            axis.text.y = element_blank(), 
                                            axis.ticks.y = element_blank(),
                                            axis.title.y = element_blank(),
                                            axis.title.x = element_blank(),
                                            axis.text.x = element_text(size = 11, colour="white"),
                                            axis.ticks = element_line(color = "#00000000"),
                                            strip.text = element_text(size = 13),
                                            plot.title = element_text(size = 12, hjust = 0.5))

ca2015.2.rs.preds.ind.g

ca2015.2.rs.preds.winner$title <- "(c) Winners"

ca2015.2.rs.preds.winner.g <- ggplot(ca2015.2.rs.preds.winner, aes(x = reorder(id, -id), 
                     y = preds)) + geom_point(aes(color = pr.type, alpha = pr.type, shape = pr.type), show.legend = FALSE) + 
                     labs(x = "District", y = "Predicted Probability of Correct Forecast") +
                     scale_x_discrete(guide = guide_axis(n.dodge = 2), expand = expansion(mult = c(.02, .02))) +
                     scale_y_continuous("Predicted Probability", limits=c(0,1), breaks = seq(0,1,.25)) +
                     geom_hline(yintercept = 0.5, color = "grey70", linetype = 2) +
                     scale_colour_manual(name = "Education & Interest", values = c("#D55E00","#E69F00","#E69F00"),
                     labels = c("Low", "High", "")) +
                     scale_alpha_manual(values = c(1, 1, .15)) + 
                     scale_shape_manual(values = c(17, 17, 17)) +
                     theme_bw() + coord_flip() + facet_grid(. ~ title)

ca2015.2.rs.preds.winner.g <- ca2015.2.rs.preds.winner.g + theme(panel.grid.major = element_blank(),
                                               panel.grid.minor = element_blank(),
                                               panel.background = element_blank(),
                                               axis.text.y = element_blank(), 
                                               axis.ticks.y = element_blank(),
                                               axis.title.y = element_blank(),
                                               axis.title.x = element_blank(),
                                               axis.text.x = element_text(size = 11, colour="black"),
                                               axis.ticks = element_line(color = "black"),
                                               strip.text = element_text(size = 13),
                                               plot.title = element_text(size = 12, hjust = 0.5))

ca2015.2.rs.preds.winner.g

tiff("ca2015_2_preds_g.tiff", units="in", width=3.5, height=8, res=300)

ca2015.2.preds.g <- ggarrange(ca2015.2.rs.preds.loser.g,
                           ca2015.2.rs.preds.ind.g,
                           ca2015.2.rs.preds.winner.g,
                           ncol = 1, nrow = 3)

ca2015.2.preds.g <- annotate_figure(ca2015.2.preds.g,
                                 left = textGrob("Electoral District     ", rot = 90, 
                                                 vjust = .6, gp = gpar(col = "black", cex = 1, fontsize = 15)),
                                 top = textGrob("Canada 2015", 
                                                   vjust = .5, gp = gpar(cex = 1, fontsize = 16)))

ca2015.2.preds.g

dev.off()

################################################################################
## RANDOM INTERCEPT VS RANDOM SLOPE
################################################################################

anova(ca2015.2.ri.fit, ca2015.2.rs.fit.bob) #put the simpler model first

################################################################################
## MARGINAL EFFECTS
################################################################################

ca2015.2.rs.univ.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("univ"))
ca2015.2.rs.interest.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("interest_n"))
ca2015.2.rs.pid.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("pidstatus_district"))
ca2015.2.rs.margin.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("margin"))
ca2015.2.rs.boundary.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("boundary"))
ca2015.2.rs.interest.univ.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("interest_n", "univ"))
ca2015.2.rs.int.pred.prop <- ggeffect(ca2015.2.rs.fit.bob, terms = c("interest_n", "pidstatus_district"))

ca2015.2.rs.univ.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("univ"))
ca2015.2.rs.interest.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("interest_n"))
ca2015.2.rs.pid.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("pidstatus_district"))
ca2015.2.rs.margin.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("margin"))
ca2015.2.rs.boundary.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("boundary"))
ca2015.2.rs.interest.univ.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("interest_n", "univ"))
ca2015.2.rs.int.pred.ref <- ggpredict(ca2015.2.rs.fit.bob, terms = c("interest_n", "pidstatus_district"))